﻿/*
 * The Porter2 stemming algorithm 
 * 
 * author: Kamil Bartocha
 * email: crow2 (at) poczta.fm
 * version: 1.0.0
 * date: 11.04.2007
 * 
 * 
 * This is a C# implementation of the Porter2 algorithm as described on:
 * http://snowball.tartarus.org/algorithms/english/stemmer.html
 * 
 * It is provided as a ready-to-use class with only a single public method:
 * 
 * public string stem(string word)
 * 
 * 
 * Usage example:
 * 
 * stemer = new Porter2();
 * string output = stemer.stem("word");
 * 
 * 
 * If you notice any flaws or ways to improve this source code, or simply decide 
 * to use it in your project, please let me know.  
 * 
 * Notices for users: 
 * 
 * 1. The sb.Replace(...) code might be somehow optimized but this
 * seemed to be the best choice to do it.
 * 
 * 2. I've tried to use the String class more often insted of the sb.ToString(...)
 * but the resulting code was 100% SLOWER! I guess it's because of StringBuilder's 
 * internal optimizations.
 *
 * 
 * LICENSE:
 * 
 * Copyright 2007 Kamil Bartocha. All rights reserved.
 * 
 * Redistribution and use in source and binary forms, 
 * with or without modification, are permitted provided that 
 * the following conditions are met:
 * 
 * 1. Redistributions of source code must retain the above copyright notice, 
 * this list of conditions and the following disclaimer.
 * 
 * 2. Redistributions in binary form must reproduce the above copyright notice, 
 * this list of conditions and the following disclaimer in the documentation 
 * and/or other materials provided with the distribution.
 * 
 * THIS SOFTWARE IS PROVIDED BY KAMIL BARTOCHA ``AS IS'' 
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FREEBSD PROJECT OR CONTRIBUTORS 
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 
 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT 
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * 
 * 
 */

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace InformationRetrieval.Normalization
{
    /// <summary>
    /// Provides methods for stemming tokens
    /// </summary>
    public class PorterStemmer : INormalizer
    {
        #region Porter2 internals
        string[] doubles = { "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt" };
        string[] validLiEndings = { "c", "d", "e", "g", "h", "k", "m", "n", "r", "t" };

        private string[,] step1bReplacements =
        {
            {"eedly","ee"},
            {"ingly",""},
            {"edly",""},
            {"eed","ee"},
            {"ing",""},
            {"ed",""}
        };

        string[,] step2Replacements =
        {
            {"ization","ize"},
            {"iveness","ive"},
            {"fulness","ful"},
            {"ational","ate"},
            {"ousness","ous"},
            {"biliti","ble"},
            {"tional","tion"},
            {"lessli","less"},
            {"fulli","ful"},
            {"entli","ent"},
            {"ation","ate"},
            {"aliti","al"},
            {"iviti","ive"},
            {"ousli","ous"},
            {"alism","al"},
            {"abli","able"},
            {"anci","ance"},
            {"alli","al"},
            {"izer","ize"},
            {"enci","ence"},
            {"ator","ate"},
            {"bli","ble"},
            {"ogi","og"},
            {"li",""}
        };

        string[,] step3Replacements =
        {
            {"ational","ate"},
            {"tional","tion"},
            {"alize","al"},
            {"icate","ic"},
            {"iciti","ic"},
            {"ative",""},
            {"ical","ic"},
            {"ness",""},
            {"ful",""}
        };

        string[] step4Replacements =            
            {"ement",
            "ment",
            "able",
            "ible",
            "ance",
            "ence",
            "ate",
            "iti",
            "ion",
            "ize",
            "ive",
            "ous",
            "ant",
            "ism",
            "ent",
            "al",
            "er",
            "ic"
        };

        string[,] exceptions =
        {
        {"skis","ski"},
        {"skies","sky"},
        {"dying","die"},
        {"lying","lie"},
        {"tying","tie"},
        {"idly","idl"},
        {"gently","gentl"},
        {"ugly","ugli"},
        {"early","earli"},
        {"only","onli"},
        {"singly","singl"},
        {"sky","sky"},
        {"news","news"},
        {"howe","howe"},
        {"atlas","atlas"},
        {"cosmos","cosmos"},
        {"bias","bias"},
        {"andes","andes"}
        };

        string[] exceptions2 =
        {"inning","outing","canning","herring","earring","proceed",
            "exceed","succeed"};


        // A helper table lookup code - used for vowel lookup 
        private bool arrayContains(string[] arr, string s)
        {
            for (int i = 0; i < arr.Length; ++i)
            {
                if (arr[i] == s) return true;
            }
            return false;
        }

        private bool isVowel(StringBuilder s, int offset)
        {
            switch (s[offset])
            {
                case 'a':
                case 'e':
                case 'i':
                case 'o':
                case 'u':
                case 'y':
                    return true;
                    break;
                default:
                    return false;
            }
        }

        private bool isShortSyllable(StringBuilder s, int offset)
        {
            if ((offset == 0) && (isVowel(s, 0)) && (!isVowel(s, 1)))
                return true;
            else
                if (
                    ((offset > 0) && (offset < s.Length - 1)) &&
                    isVowel(s, offset) && !isVowel(s, offset + 1) &&
                    (s[offset + 1] != 'w' && s[offset + 1] != 'x' && s[offset + 1] != 'Y')
                    && !isVowel(s, offset - 1))
                    return true;
                else
                    return false;
        }

        private bool isShortWord(StringBuilder s, int r1)
        {
            if ((r1 >= s.Length) && (isShortSyllable(s, s.Length - 2))) return true;

            return false;
        }

        private void changeY(StringBuilder sb)
        {
            if (sb[0] == 'y') sb[0] = 'Y';

            for (int i = 1; i < sb.Length; ++i)
            {
                if ((sb[i] == 'y') && (isVowel(sb, i - 1))) sb[i] = 'Y';
            }
        }

        private void computeR1R2(StringBuilder sb, ref int r1, ref int r2)
        {
            r1 = sb.Length;
            r2 = sb.Length;

            if ((sb.Length >= 5) && (sb.ToString(0, 5) == "gener" || sb.ToString(0, 5) == "arsen")) r1 = 5;
            if ((sb.Length >= 6) && (sb.ToString(0, 6) == "commun")) r1 = 6;

            if (r1 == sb.Length) // If R1 has not been changed by exception words
                for (int i = 1; i < sb.Length; ++i) // Compute R1 according to the algorithm
                {
                    if ((!isVowel(sb, i)) && (isVowel(sb, i - 1)))
                    {
                        r1 = i + 1;
                        break;
                    }
                }

            for (int i = r1 + 1; i < sb.Length; ++i)
            {
                if ((!isVowel(sb, i)) && (isVowel(sb, i - 1)))
                {
                    r2 = i + 1;
                    break;
                }
            }
        }

        private void step0(StringBuilder sb)
        {

            if ((sb.Length >= 3) && (sb.ToString(sb.Length - 3, 3) == "'s'"))
                sb.Remove(sb.Length - 3, 3);
            else
                if ((sb.Length >= 2) && (sb.ToString(sb.Length - 2, 2) == "'s"))
                    sb.Remove(sb.Length - 2, 2);
                else
                    if (sb[sb.Length - 1] == '\'')
                        sb.Remove(sb.Length - 1, 1);
        }

        private void step1a(StringBuilder sb)
        {

            if ((sb.Length >= 4) && sb.ToString(sb.Length - 4, 4) == "sses")
                sb.Replace("sses", "ss", sb.Length - 4, 4);
            else
                if ((sb.Length >= 3) && (sb.ToString(sb.Length - 3, 3) == "ied" || sb.ToString(sb.Length - 3, 3) == "ies"))
                {
                    if (sb.Length > 4)
                        sb.Replace(sb.ToString(sb.Length - 3, 3), "i", sb.Length - 3, 3);
                    else
                        sb.Replace(sb.ToString(sb.Length - 3, 3), "ie", sb.Length - 3, 3);
                }
                else
                    if ((sb.Length >= 2) && (sb.ToString(sb.Length - 2, 2) == "us" || sb.ToString(sb.Length - 2, 2) == "ss"))
                        return;
                    else
                        if ((sb.Length > 0) && (sb.ToString(sb.Length - 1, 1) == "s"))
                        {
                            for (int i = 0; i < sb.Length - 2; ++i)
                                if (isVowel(sb, i))
                                {
                                    sb.Remove(sb.Length - 1, 1);
                                    break;
                                }
                        }

        }

        private void step1b(StringBuilder sb, int r1)
        {
            for (int i = 0; i < 6; ++i)
            {
                if ((sb.Length > step1bReplacements[i, 0].Length) && (sb.ToString(sb.Length - step1bReplacements[i, 0].Length, step1bReplacements[i, 0].Length) == step1bReplacements[i, 0]))
                {
                    switch (step1bReplacements[i, 0])
                    {
                        case "eedly":
                        case "eed":
                            if (sb.Length - step1bReplacements[i, 0].Length >= r1)
                                sb.Replace(step1bReplacements[i, 0], step1bReplacements[i, 1], sb.Length - step1bReplacements[i, 0].Length, step1bReplacements[i, 0].Length);
                            break;
                        default:
                            bool found = false;
                            for (int j = 0; j < sb.Length - step1bReplacements[i, 0].Length; ++j)
                            {
                                if (isVowel(sb, j))
                                {
                                    sb.Replace(step1bReplacements[i, 0], step1bReplacements[i, 1], sb.Length - step1bReplacements[i, 0].Length, step1bReplacements[i, 0].Length);
                                    found = true;
                                    break;
                                }
                            }
                            if (!found || sb.Length<=1) return;
                            switch (sb.ToString(sb.Length - 2, 2))
                            {
                                case "at":
                                case "bl":
                                case "iz":
                                    sb.Append("e");
                                    return;
                            }
                            if (arrayContains(doubles, sb.ToString(sb.Length - 2, 2)))
                            {
                                sb.Remove(sb.Length - 1, 1);
                                return;
                            }
                            if (isShortWord(sb, r1))
                                sb.Append("e");
                            break;
                    }
                    return;
                }
            }
        }

        private void step1c(StringBuilder sb)
        {
            if ((sb.Length > 0) &&
                (sb[sb.Length - 1] == 'y' || sb[sb.Length - 1] == 'Y') &&
                (sb.Length > 2) && (!isVowel(sb, sb.Length - 2))
               )
                sb[sb.Length - 1] = 'i';
        }

        private void step2(StringBuilder sb, int r1)
        {
            for (int i = 0; i < 24; ++i)
            {
                if (
                    (sb.Length >= step2Replacements[i, 0].Length) &&
                    (sb.ToString(sb.Length - step2Replacements[i, 0].Length, step2Replacements[i, 0].Length) == step2Replacements[i, 0])
                    )
                {
                    if (sb.Length - step2Replacements[i, 0].Length >= r1)
                    {
                        switch (step2Replacements[i, 0])
                        {
                            case "ogi":
                                if ((sb.Length > 3) &&
                                    (sb[sb.Length - step2Replacements[i, 0].Length - 1] == 'l')
                                    )
                                    sb.Replace(step2Replacements[i, 0], step2Replacements[i, 1], sb.Length - step2Replacements[i, 0].Length, step2Replacements[i, 0].Length);
                                return;
                            case "li":
                                if ((sb.Length > 1) &&
                                    (arrayContains(validLiEndings, sb.ToString(sb.Length - 3, 1)))
                                    )
                                    sb.Remove(sb.Length - 2, 2);
                                return;
                            default:
                                sb.Replace(step2Replacements[i, 0], step2Replacements[i, 1], sb.Length - step2Replacements[i, 0].Length, step2Replacements[i, 0].Length);
                                return;
                                break;

                        }
                    }
                    else
                        return;
                }
            }
        }

        private void step3(StringBuilder sb, int r1, int r2)
        {
            for (int i = 0; i < 9; ++i)
            {
                if (
                    (sb.Length >= step3Replacements[i, 0].Length) &&
                    (sb.ToString(sb.Length - step3Replacements[i, 0].Length, step3Replacements[i, 0].Length) == step3Replacements[i, 0])
                    )
                {
                    if (sb.Length - step3Replacements[i, 0].Length >= r1)
                    {
                        switch (step3Replacements[i, 0])
                        {
                            case "ative":
                                if (sb.Length - step3Replacements[i, 0].Length >= r2)
                                    sb.Replace(step3Replacements[i, 0], step3Replacements[i, 1], sb.Length - step3Replacements[i, 0].Length, step3Replacements[i, 0].Length);
                                return;
                            default:
                                sb.Replace(step3Replacements[i, 0], step3Replacements[i, 1], sb.Length - step3Replacements[i, 0].Length, step3Replacements[i, 0].Length);
                                return;
                        }
                    }
                    else return;
                }
            }
        }

        private void step4(StringBuilder sb, int r2)
        {
            for (int i = 0; i < 18; ++i)
            {
                if (
                    (sb.Length >= step4Replacements[i].Length) &&
                    (sb.ToString(sb.Length - step4Replacements[i].Length, step4Replacements[i].Length) == step4Replacements[i])                    // >=
                    )
                {
                    if (sb.Length - step4Replacements[i].Length >= r2)
                    {
                        switch (step4Replacements[i])
                        {
                            case "ion":
                                if (
                                    (sb.Length > 3) &&
                                    (
                                        (sb[sb.Length - step4Replacements[i].Length - 1] == 's') ||
                                        (sb[sb.Length - step4Replacements[i].Length - 1] == 't')
                                    )
                                   )
                                    sb.Remove(sb.Length - step4Replacements[i].Length, step4Replacements[i].Length);
                                return;
                            default:
                                sb.Remove(sb.Length - step4Replacements[i].Length, step4Replacements[i].Length);
                                return;
                        }
                    }
                    else
                        return;
                }
            }

        }

        private void step5(StringBuilder sb, int r1, int r2)
        {
            if (sb.Length > 0)
                if (
                    (sb[sb.Length - 1] == 'e') &&
                    (
                        (sb.Length - 1 >= r2) ||
                        ((sb.Length - 1 >= r1) && (!isShortSyllable(sb, sb.Length - 3)))
                    )
                   )
                    sb.Remove(sb.Length - 1, 1);
                else
                    if (
                        (sb[sb.Length - 1] == 'l') &&
                            (sb.Length - 1 >= r2) &&
                            (sb[sb.Length - 2] == 'l')
                        )
                        sb.Remove(sb.Length - 1, 1);
        }

        private string stem(string word)
        {
            if (word.Length < 3) return word;

            StringBuilder sb = new StringBuilder(word.ToLower());

            if (sb[0] == '\'') sb.Remove(0, 1);

            for (int i = 0; i < exceptions.Length / 2; ++i)
                if (word == exceptions[i, 0])
                    return exceptions[i, 1];

            int r1 = 0, r2 = 0;
            changeY(sb);
            computeR1R2(sb, ref r1, ref r2);

            step0(sb);
            step1a(sb);

            for (int i = 0; i < exceptions2.Length; ++i)
                if (sb.ToString() == exceptions2[i])
                    return exceptions2[i];

            step1b(sb, r1);
            step1c(sb);
            step2(sb, r1);
            step3(sb, r1, r2);
            step4(sb, r2);
            step5(sb, r1, r2);


            return sb.ToString().ToLower();
        }
        #endregion

        #region INormalizer Members

        /// <summary>
        /// Takes an un-stemmed token and returns its stemmed (i.e. truncated) form.
        /// The Porter2 algorithm is used for stemming.
        /// </summary>
        /// <param name="token">An un-stemmed token</param>
        /// <returns>The stemmed token</returns>
        public string Normalize(string token)
        {
            return stem(token);
        }

        #endregion
    }
}
